import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Enable Jedi-based autocompletion in the Jupyter notebook (IPython magic).
%config Completer.use_jedi = True
import os
# List the dataset directory to see which CSV files are available.
os.listdir(r'C:\Users\kbsha\Downloads\Leeds\Reading List\Python\Uber\Datasets')
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the 100k-row sample of the Jan-June 2015 Uber pickup data.
uber_data = pd.read_csv(r'C:\Users\kbsha\Downloads\Leeds\Reading List\Python\Uber\Datasets\uber-raw-data-janjune-15_sample.csv')
# (rows, columns) of the raw sample.
uber_data.shape
(100000, 4)
# Check for duplicate rows: duplicated() returns True for each row that is an
# exact repeat of an earlier row, False otherwise.
uber_data.duplicated()
0 False
1 False
2 False
3 False
4 False
...
99995 False
99996 False
99997 False
99998 False
99999 False
Length: 100000, dtype: bool
# Count the duplicate rows (True sums as 1); the dataset has 54 duplicates.
uber_data.duplicated().sum()
54
# Remove the duplicate rows in place (modifies uber_data directly).
uber_data.drop_duplicates(inplace=True)
# Confirm the dataset now has no duplicate rows.
uber_data.duplicated().sum()
0
# Shape after de-duplication: 54 fewer rows than the original 100000.
uber_data.shape
(99946, 4)
# Inspect the data type of each column.
uber_data.dtypes
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
# Count null values per column; only Affiliated_base_num has nulls (1116).
uber_data.isnull().sum()
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
# Preview the Pickup_date column (currently stored as strings / object dtype).
uber_data['Pickup_date']
0 2015-05-02 21:43:00
1 2015-01-20 19:52:59
2 2015-03-19 20:26:00
3 2015-04-10 17:38:00
4 2015-03-23 07:03:00
...
99995 2015-04-13 16:12:00
99996 2015-03-06 21:32:00
99997 2015-03-19 19:56:00
99998 2015-05-02 16:02:00
99999 2015-06-24 16:04:00
Name: Pickup_date, Length: 99946, dtype: object
# Pickup_date is stored as strings (object dtype); convert it to datetime64
# so date attributes (month, weekday, hour, ...) can be derived from it.
uber_data['Pickup_date'] = pd.to_datetime(uber_data['Pickup_date'])
# NOTE: type() on a column always reports pandas Series regardless of the
# element dtype, so it cannot confirm the conversion — check .dtype instead.
uber_data['Pickup_date'].dtype
pandas.core.series.Series
# Verify the conversion: Pickup_date is now datetime64[ns].
uber_data.dtypes
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
'''The dataset does not have a month column for finding the month with the
maximum pickups, but it does have the Pickup_date column, so we can derive
the required attributes from it.
'''
uber_data
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
99946 rows × 4 columns
# Derive a month-name column (e.g. 'January', 'May') from Pickup_date.
# (The original cell computed dt.month_name() once, discarded the result,
# then computed it again for the assignment — the throwaway call is removed.)
uber_data['month'] = uber_data['Pickup_date'].dt.month_name()
uber_data
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | |
|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 | March |
| ... | ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 | April |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 | March |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 | March |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 | May |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 | June |
99946 rows × 5 columns
# The dataset now has a month column; check the pickup frequency per month.
uber_data['month'].value_counts()  # descending order; June has the most pickups
June 19620 May 18660 April 15982 March 15969 February 15896 January 13819 Name: month, dtype: int64
# Visualize the monthly counts with a line plot (note: the x-axis follows the
# count-sorted order from value_counts, not calendar order).
uber_data['month'].value_counts().plot()
<Axes: >
# Visualize the same monthly counts as a bar chart.
uber_data['month'].value_counts().plot(kind='bar')
<Axes: >
''' The bar chart shows that June has the highest pickups and January the least. If we want to go in
depth, we can also find which day has the highest and lowest, but for this we need the data in tabular format - cross tabulation'''
# In order to create the pivot table, derive more attributes from Pickup_date:
uber_data['weekday'] = uber_data['Pickup_date'].dt.day_name()  # e.g. 'Monday'
uber_data['day'] = uber_data['Pickup_date'].dt.day             # day of month (1-31)
uber_data['hour'] = uber_data['Pickup_date'].dt.hour           # hour of day (0-23)
uber_data['minute'] = uber_data['Pickup_date'].dt.minute       # minute (0-59)
uber_data
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | weekday | day | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May | Saturday | 2 | 21 | 43 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January | Tuesday | 20 | 19 | 52 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March | Thursday | 19 | 20 | 26 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April | Friday | 10 | 17 | 38 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 | March | Monday | 23 | 7 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 | April | Monday | 13 | 16 | 12 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 | March | Friday | 6 | 21 | 32 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 | March | Thursday | 19 | 19 | 56 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 | May | Saturday | 2 | 16 | 2 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 | June | Wednesday | 24 | 16 | 4 |
99946 rows × 9 columns
# Cross-tabulate pickup counts: month as the row index, weekday as columns.
pivot = pd.crosstab(index=uber_data['month'], columns=uber_data['weekday'])
pivot
| weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
# Show the weekday breakdown per month as a grouped bar chart.
pivot.plot(kind='bar', figsize=(8,6))
<Axes: xlabel='month'>
To find the trend we need the count of pickups for each weekday on an hourly basis, e.g. the COUNT of pickups on Friday at hour 0, hour 1, ... up to hour 23.
Previously we derived attributes such as weekday and hour; using the groupby function, pass weekday and hour as arguments to find the total count.
# Count pickups for every (weekday, hour) combination (MultiIndex Series).
uber_data.groupby(['weekday', 'hour']).size()
# To get the result as a DataFrame instead of a MultiIndex Series,
# set as_index=False and store it in summary.
summary= uber_data.groupby(['weekday', 'hour'], as_index=False).size()
summary
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
# A pointplot from the seaborn module displays the hourly pickup trend for
# each weekday in a single figure.
plt.figure(figsize=(10,6))
sns.pointplot(x='hour', y='size', hue='weekday', data=summary)
<Axes: xlabel='hour', ylabel='size'>
By examining the pointplot, it becomes apparent that pickups generally increase during the evening throughout the entire week. Notably, on Saturday and Friday, pickups consistently rise progressively into the late night hours, following a similar pattern observed on Thursday, albeit with a slight decrease in pickups during the late night period.
# Import the Jan-Feb 2015 FOIL dataset (active vehicles and trips per base).
uber_foil = pd.read_csv(r'C:\Users\kbsha\Downloads\Leeds\Reading List\Python\Uber\Datasets\Uber-Jan-Feb-FOIL.csv')
uber_foil
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
| ... | ... | ... | ... | ... |
| 349 | B02764 | 2/28/2015 | 3952 | 39812 |
| 350 | B02617 | 2/28/2015 | 1372 | 14022 |
| 351 | B02682 | 2/28/2015 | 1386 | 14472 |
| 352 | B02512 | 2/28/2015 | 230 | 1803 |
| 353 | B02765 | 2/28/2015 | 747 | 7753 |
354 rows × 4 columns
# Check for null values per column (none here).
uber_foil.isnull().sum()
dispatching_base_number 0 date 0 active_vehicles 0 trips 0 dtype: int64
# Check for duplicate rows (none here).
uber_foil.duplicated().sum()
0
# Column names of the FOIL dataset.
uber_foil.columns
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
# Box plot (seaborn) of the active-vehicle distribution per dispatching base,
# accompanied by a printed describe() summary table.
plt.figure(figsize=(10,6))
sns.boxplot(x='dispatching_base_number', y='active_vehicles', data=uber_foil)

# Build the per-base summary statistics in one chained expression and
# relabel the columns for a friendlier display.
summary_table = (
    uber_foil
    .groupby('dispatching_base_number')['active_vehicles']
    .describe()
    .reset_index()
)
summary_table.columns = ['Dispatching Base Number', 'Count', 'Mean', 'Std',
                         'Min', '25%', '50%', '75%', 'Max']

print("Summary of Active Vehicles per Dispatching Base Number:")
print(summary_table)
plt.show()
Summary of Active Vehicles per Dispatching Base Number:
Dispatching Base Number Count Mean Std Min 25% \
0 B02512 59.0 222.457627 33.423264 112.0 203.5
1 B02598 59.0 994.118644 134.303579 434.0 936.5
2 B02617 59.0 1351.830508 161.360290 596.0 1274.5
3 B02682 59.0 1210.694915 190.817773 600.0 1088.0
4 B02764 59.0 3682.881356 438.326444 1619.0 3488.5
5 B02765 59.0 382.627119 180.135476 135.0 250.0
50% 75% Max
0 230.0 245.0 281.0
1 1011.0 1077.0 1216.0
2 1367.0 1456.5 1590.0
3 1235.0 1338.0 1523.0
4 3734.0 3955.5 4395.0
5 296.0 528.5 786.0
# Re-list the dataset directory to collect the raw-data file names.
files = os.listdir(r'C:\Users\kbsha\Downloads\Leeds\Reading List\Python\Uber\Datasets')
files
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Keep only the six monthly 2014 Uber raw-data files. Selecting by the name
# pattern is more robust than the positional slice files[-8:] followed by two
# remove() calls, which silently breaks if the directory contents change.
files = [f for f in files if f.startswith('uber-raw-data') and f.endswith('14.csv')]
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Combine the six monthly CSV files into one DataFrame. Reading every file
# first and concatenating once avoids the quadratic cost of re-concatenating
# a growing DataFrame inside the loop, and os.path.join builds the paths
# portably instead of string concatenation with '/'.
path = r'C:\Users\kbsha\Downloads\Leeds\Reading List\Python\Uber\Datasets'
monthly_frames = [pd.read_csv(os.path.join(path, file)) for file in files]
final = pd.concat(monthly_frames)
# (rows, columns) of the combined dataset.
final.shape
(4534327, 4)
# Drop exact duplicate rows from the combined dataset, in place.
final.drop_duplicates(inplace=True)
# Roughly 82k duplicate rows were removed.
final.shape
(4451746, 4)
# Preview the combined data: pickup timestamp, coordinates, and base code.
final.head(4)
# Lines running from top to bottom (pole to pole) = Lon (360 degrees around)
# Lines running from west to east (parallel to the equator) = Lat (180 degrees)
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
| 1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
| 2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
| 3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
# Rush locations can be shown with a map-based visualisation: count the
# number of pickups at each (Lat, Lon) coordinate pair.
uber_rush = final.groupby(['Lat','Lon'], as_index=False).size()
uber_rush
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
# Install folium (map visualisation library) if not already present.
!pip install folium
Requirement already satisfied: folium in c:\users\kbsha\anaconda3\lib\site-packages (0.15.1) Requirement already satisfied: branca>=0.6.0 in c:\users\kbsha\anaconda3\lib\site-packages (from folium) (0.7.0) Requirement already satisfied: numpy in c:\users\kbsha\anaconda3\lib\site-packages (from folium) (1.23.5) Requirement already satisfied: xyzservices in c:\users\kbsha\anaconda3\lib\site-packages (from folium) (2023.10.1) Requirement already satisfied: jinja2>=2.9 in c:\users\kbsha\anaconda3\lib\site-packages (from folium) (3.1.2) Requirement already satisfied: requests in c:\users\kbsha\anaconda3\lib\site-packages (from folium) (2.28.1) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\kbsha\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.1) Requirement already satisfied: idna<4,>=2.5 in c:\users\kbsha\anaconda3\lib\site-packages (from requests->folium) (3.4) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kbsha\anaconda3\lib\site-packages (from requests->folium) (2023.5.7) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\kbsha\anaconda3\lib\site-packages (from requests->folium) (1.26.14) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\kbsha\anaconda3\lib\site-packages (from requests->folium) (2.0.4)
import folium
# Create a base world map with folium's Map(); displaying it renders the map.
basemap = folium.Map()
basemap
from folium.plugins import HeatMap
# Overlay a heat map of the pickup counts ([Lat, Lon, size] rows) on the map.
HeatMap(uber_rush).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x29d3abcb940>
# The bright spots on the heat map indicate the rush (high pickup) areas.
basemap